#### Prepare the data
#### Jonne Kamphorst
#### May 2021




#### Load libraries ####
library(dplyr)
library(tidyverse)
library(cjoint) #for creating dataframe
library(stringr) #for reg expr
library(gdata) #for keep
library(readxl)
library(stringr)
library(Hmisc)







#### Load data frames for normal questions ####
## Cleaning part 1:
# cleaned in excel: Remove 2nd and 3th row and remove qualtrics' standard columns
data_1 <- read.csv("data/data_final_all/part1_choicetext.csv") #first part of the survey



# remove the columns that are for the pvda experiment. 
data_1 <- data_1 %>% select(-c(Q8.2_1, Q8.2_2, Q8.2_3, Q8.2_4, Q8.2_5,
                               Q8.2_6, Q8.2_8))

# Remove the attention checks
data_1 <- data_1 %>% select(-c(Q9.3,	`Q9.4_First.Click`, `Q9.4_Last.Click`, `Q9.4_Page.Submit`, `Q9.4_Click.Count`,
                               Q9.5, `Q9.6_First.Click`, 	`Q9.6_Last.Click`,	`Q9.6_Page.Submit`,	`Q9.6_Click.Count`,
                               Q9.7, 	`Q9.8_First.Click`,	`Q9.8_Last.Click`,	`Q9.8_Page.Submit`,
                               `Q9.8_Click.Count`,	Q9.10))

data_1 <- subset(data_1, gc==1) #drop the respondents who did not pass the attention checks or did not want to participate
data_1 <- data_1 %>% select(-gc) %>% distinct(rid, .keep_all=T) #drop column to that it doesn't merge on it later



## Cleaning part 2:
# Cleaned in excel: cleaned the same way as part 1 and also removed all conjoint questions and pvda questions
data_2 <- read.csv("data/data_final_all/part2_choicetext.csv") #has to be excel for the time variable
data_2 <- subset(data_2, gc==1) #people who didn't finish (none)
data_2 <- data_2 %>% select(-gc)


# extract the credits left from the url.
data_2$credit.left <- as.numeric(str_extract(data_2$Q_URL, "(?i)(?<=credrem\\D)\\d+"))


# Create absolute values from QV variables
data_2$QV1_abs <- abs(data_2$QV1)
data_2$QV2_abs <- abs(data_2$QV2)
data_2$QV3_abs <- abs(data_2$QV3)
data_2$QV4_abs <- abs(data_2$QV4)
data_2$QV5_abs <- abs(data_2$QV5)
data_2$QV6_abs <- abs(data_2$QV6)




## Check how many credits people used (just to see if it lines up with the URL data)
data_2$credits <- (data_2$QV1_abs)^2 +
  (data_2$QV2_abs)^2 +
  (data_2$QV3_abs)^2 +
  (data_2$QV4_abs)^2 +
  (data_2$QV5_abs)^2 +
  (data_2$QV6_abs)^2

data_2$credrem_manual <- 36 - data_2$credits

non_matching <- subset(data_2, !(credrem_manual %in% data_2$credit.left)) #should be 0, checks if credits left manual and from url are the same
rm(non_matching)





# Only keep the people from the second survey
qvsrp2 <- read.csv("data/data_final_all/qvsrp2.csv")
qvsrp2 <- qvsrp2 %>% select(user.id) %>% rename(userid = user.id)


data_2 <- inner_join(data_2, qvsrp2) #only keep respondents who did the second QV bit. 

if (anyNA(data_2$Duration..in.seconds.) == T |
    any(duplicated(data_2$userid)) == T){
  print("Error!")
} else {
  print("All good!")
}




data_2 <- data_2 %>% select(-c("StartDate", "EndDate", "Status", "IPAddress", #drop variables in both datasets for match
                            "Progress", "Finished",
                            "RecordedDate", "ResponseId", "RecipientLastName",
                            "RecipientFirstName", "RecipientEmail", "ExternalReference",
                            "LocationLatitude", "LocationLongitude", "DistributionChannel",
                            "UserLanguage", "rid"))


data_1 <- data_1 %>% select(-c("Duration..in.seconds."))

data_2$Duration..in.seconds.
final_data_part2 <- inner_join(data_2, data_1)
keep(final_data_part2, sure=T)


if (anyNA(final_data_part2$Duration..in.seconds.) == T |
    any(duplicated(final_data_part2$userid)) == T){
  print("Error!")
} else {
  print("All good!")
}



final_data_part2$Duration..in.seconds.

#remove the people who raced through the survey
mean(final_data_part2$Duration..in.seconds.)
min(final_data_part2$Duration..in.seconds.)
max(final_data_part2$Duration..in.seconds.)
describe(final_data_part2$Duration..in.seconds.)
final_data_part2 <- subset(final_data_part2, Duration..in.seconds. > 100) # at least 100 seconds in part 2






#### Load conjoint data frames ####


## load conjoint data for the *choice* of profiles using the hainmueller package
# Cleaned in excel: only conjoint columns, ID variable should be first column. 
responses_choice <- c("QPT4.2", "QPT4.3", "QPT4.4", "QPT4.5", "QPT4.7", "QPT4.10",
               "QPT4.13", "QPT4.16", "QPT4.19", "QPT4.23") #variable names for answers. 


conjoint_df_choice <- read.qualtrics(filename="data/data_final_all/p2_conjoint.csv", responses = responses_choice, 
                              respondentID = "rid", letter = "F", new.format = T) #user ID needs to be the first column!



# drop the columns I don't need #
drop <- c("respondentIndex", "We.moeten.klimaatverandering.hard.aanpakken.rowpos",
          "Scholen.mogen.homoseksualiteit.afkeuren.rowpos", "Megastallen.moeten.verboden.worden.rowpos",
          "Nederland.moet.meer.vluchtelingen.opnemen.rowpos", "Het.minimumloon.moet.omhoog.rowpos",
          "Er.moet.een.referendum.komen.over.de.EU.rowpos", "Partij.rowpos")
conjoint_df_choice <- conjoint_df_choice %>% select(-drop)
rm(drop)

conjoint_df_choice$task <- as.numeric(conjoint_df_choice$task) #change type and names so you can join later
conjoint_df_choice$profile <- as.numeric(conjoint_df_choice$profile)
conjoint_df_choice <- conjoint_df_choice %>% rename(rid = respondent)






## Load conjoint data for the *mechanism* variables
data_cjoint_mech <- read.csv("data/data_final_all/p2_conjointmech.csv") #

#select the right columns
data_cjoint_mech <- data_cjoint_mech %>% select(rid, # response id
                                                QPT4.11_1, QPT4.11_2, QPT4.12_1, QPT4.12_2, #profile 6
                                                QPT4.14_1, QPT4.14_2, QPT4.15_1, QPT4.15_2, #profile 7
                                                QPT4.17_1, QPT4.17_2, QPT4.18_1, QPT4.18_2, #profile 8
                                                QPT4.20_1, QPT4.20_4, QPT4.21_1, QPT4.21_4, #profile 9
                                                QPT4.24_1, QPT4.24_4, QPT4.25_1, QPT4.25_4) #profile 10
                                                
#rename to make it possible to make the data longer
data_cjoint_mech <- data_cjoint_mech %>% rename(`6_1_position` = QPT4.11_1, 
                                                `6_2_position` = QPT4.11_2, 
                                                `6_1_clarity` = QPT4.12_1, 
                                                `6_2_clarity` = QPT4.12_2, 
                                                `7_1_position` = QPT4.14_1, 
                                                `7_2_position` = QPT4.14_2, 
                                                `7_1_clarity` = QPT4.15_1, 
                                                `7_2_clarity` = QPT4.15_2,
                                                `8_1_position` = QPT4.17_1, 
                                                `8_2_position` = QPT4.17_2, 
                                                `8_1_clarity` = QPT4.18_1, 
                                                `8_2_clarity` = QPT4.18_2,
                                                `9_1_position` = QPT4.20_1, 
                                                `9_2_position` = QPT4.20_4, 
                                                `9_1_clarity` = QPT4.21_1, 
                                                `9_2_clarity` = QPT4.21_4,
                                                `10_1_position` = QPT4.24_1, 
                                                `10_2_position` = QPT4.24_4, 
                                                `10_1_clarity` = QPT4.25_1, 
                                                `10_2_clarity` = QPT4.25_4)

                                                

# First turn into wide for position
data_position <- data_cjoint_mech %>% select(rid, ends_with("position")) #only take position
data_position <- data_position %>% pivot_longer(!rid, names_to="task_profile", values_to="position") #turn into wider
data_position <- data_position %>% mutate(task_profile = str_remove_all(task_profile, "_position")) # remove position ending
data_position <- data_position %>% separate(task_profile, c("task", "profile")) #separate the task and profile


# And for clarity
data_clarity <- data_cjoint_mech %>% select(rid, ends_with("clarity")) #only take clarity
data_clarity <- data_clarity %>% pivot_longer(!rid, names_to="task_profile", values_to="clarity") #turn into wider
data_clarity <- data_clarity %>% mutate(task_profile = str_remove_all(task_profile, "_clarity")) # remove position ending
data_clarity <- data_clarity %>% separate(task_profile, c("task", "profile")) #separate the task and profile

#Join the data together
data_cjoint_mech <- full_join(data_position, data_clarity)
data_cjoint_mech$task <- as.numeric(data_cjoint_mech$task) #change type so can join
data_cjoint_mech$profile <- as.numeric(data_cjoint_mech$profile)
rm(data_position, data_clarity)



#### Create the final data frame wide format ####
data_cjoint <- full_join(data_cjoint_mech, conjoint_df_choice) #join mechanism and choice outcomes

## Merge other part 2 and part 1 data in
final_data_part2_wide <- inner_join(data_cjoint, final_data_part2)





# Rename column names
final_data_part2_wide <- final_data_part2_wide %>% rename(`EU`= Er.moet.een.referendum.komen.over.de.EU,
                                      `wage` = Het.minimumloon.moet.omhoog,
                                      `immi` = Nederland.moet.meer.vluchtelingen.opnemen,
                                      `farms` = Megastallen.moeten.verboden.worden,
                                      `lgbt` = Scholen.mogen.homoseksualiteit.afkeuren,
                                      `climate` = We.moeten.klimaatverandering.hard.aanpakken) %>%
  relocate(selected, .after=profile)





# Save the wide data
#write_rds(final_data_part2_wide, "data/final_data_part2_wide.rds")






## Turn into a longer dataframe where the attributes are in a single column
# Rename column names
data_cjoint <- data_cjoint %>% rename(`EU`= Er.moet.een.referendum.komen.over.de.EU,
                                      `wage` = Het.minimumloon.moet.omhoog,
                                      `immi` = Nederland.moet.meer.vluchtelingen.opnemen,
                                      `farms` = Megastallen.moeten.verboden.worden,
                                      `lgbt` = Scholen.mogen.homoseksualiteit.afkeuren,
                                      `climate` = We.moeten.klimaatverandering.hard.aanpakken) %>%
  relocate(selected, .after=profile)


cols <- c("rid", "task", "profile", "selected", "Partij", "position", "clarity")
final_data_part2_l <- data_cjoint %>% pivot_longer(!cols, names_to = "issue", values_to = "strategy")
rm(cols)



## Merge other part 2 and part 1 data in
final_data_part2_l <- inner_join(final_data_part2_l, final_data_part2)
#write_rds(final_data_part2_l, "data/final_data_part2_long.rds")
keep(final_data_part2, sure=T)
rm(final_data_part2)























